{
 "metadata": {
  "name": "",
  "signature": "sha256:2339f9b9331c1e7f79753f37c7ed2f30bb5197835d4a6595222c295c9148ee5f"
 },
 "nbformat": 3,
 "nbformat_minor": 0,
 "worksheets": [
  {
   "cells": [
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "#Model Assessment and Selection\n",
      "##Ideas and Examples of Trade-offs\n",
      "Scott Hendrickson\n",
      "\n",
      "2015 April 3\n",
      "\n",
      "(With very heavy borrowing from ESL!)"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "import numpy as np\n",
      "import matplotlib.pyplot as plt \n",
      "%matplotlib inline\n",
      "from IPython.display import Image"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "Image(filename=\"Selection_003.png\", width=600)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "Image(filename=\"Selection_001.png\", width=600)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "Image(filename=\"Selection_004.png\")"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "Image(filename=\"Selection_002.png\", width=600)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "###In real life, things are messier..."
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "from sklearn.neighbors import KNeighborsClassifier\n",
      "from sklearn.neighbors import KNeighborsRegressor\n",
      "from sklearn.linear_model import LinearRegression\n",
      "from sklearn.linear_model import Lasso\n",
      "from sklearn.linear_model import LassoCV\n",
      "from sklearn.linear_model import Ridge\n",
      "from sklearn import cross_validation\n",
      "from sklearn import datasets\n",
      "from sklearn.cross_validation import cross_val_predict\n",
      "from sklearn import linear_model"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# boston house prices data set\n",
      "boston = datasets.load_boston()\n",
      "print boston.keys()\n",
      "#print boston.feature_names\n",
      "print boston.DESCR\n",
      "print boston.target[:10]"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "###KNN regression to for housing price prediction..."
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Set up many trials on same data set to understand \n",
      "#    test vs. sample error\n",
      "#    variance in estimates\n",
      "n_trials, n_max_neighbors = 50, 100\n",
      "err_train, err_test = np.zeros(n_max_neighbors-2), np.zeros(n_max_neighbors-2)\n",
      "for i in range(n_trials):\n",
      "    X_train, X_test, Y_train, Y_test = cross_validation.train_test_split(boston.data, boston.target, test_size=0.3)\n",
      "    x_complexity, y_train, y_test = [], [], []\n",
      "    for k in range(n_max_neighbors, 2, -1):\n",
      "        clf = KNeighborsRegressor(n_neighbors=k)\n",
      "        clf.fit(X_train,Y_train)\n",
      "        x_complexity.append(k)\n",
      "        y_tmp = clf.score(X_train,Y_train)\n",
      "        y_train.append(y_tmp)\n",
      "        err_train[n_max_neighbors-k] += y_tmp\n",
      "        y_tmp = clf.score(X_test,Y_test)\n",
      "        y_test.append(y_tmp)\n",
      "        err_test[n_max_neighbors-k] += y_tmp\n",
      "    plt.plot(x_complexity, y_test, color=\"blue\", alpha=0.2)\n",
      "    plt.plot(x_complexity, y_train, color=\"red\", alpha=0.2)\n",
      "plt.plot(x_complexity, err_test/n_trials, color=\"blue\")\n",
      "plt.plot(x_complexity, err_train/n_trials, color=\"red\")\n",
      "plt.xlabel(\"Number of Neighbors\")\n",
      "plt.ylabel(\"Fraction Correct\")\n",
      "plt.show()"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "Image(filename=\"Selection_005.png\")"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "The last term scales as the 1/number of nearest neighbors.   As the complexity of the fitting model goes up (lower k!) the variance of the model goes up.  Large k averages over a larger data set, smoothing our predictions.\n",
      "\n",
      "###Compare other comlexity... Linear"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# simple single value gut check\n",
      "lr = LinearRegression()\n",
      "boston = datasets.load_boston()\n",
      "y = boston.target\n",
      "# cross_val_predict returns an array of the same size as `y` where each entry\n",
      "# is a prediction obtained by cross validation\n",
      "predicted = cross_val_predict(lr, boston.data, y, cv=10)\n",
      "fig,ax = plt.subplots()\n",
      "ax.scatter(y, predicted)\n",
      "ax.plot([y.min(), y.max()], [y.min(), y.max()], 'k--', lw=4)\n",
      "ax.set_xlabel('Measured')\n",
      "ax.set_ylabel('Predicted')\n",
      "plt.show()"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "Image(filename=\"Screen Shot 2.png\", width=600)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "##############################################################################\n",
      "# LassoCV: coordinate descent\n",
      "diabetes = datasets.load_diabetes()\n",
      "X = diabetes.data\n",
      "y = diabetes.target\n",
      "\n",
      "rng = np.random.RandomState(42)\n",
      "X = np.c_[X, rng.randn(X.shape[0], 14)]  # add some bad features\n",
      "\n",
      "# normalize data as done by Lars to allow for comparison\n",
      "X /= np.sqrt(np.sum(X ** 2, axis=0))\n",
      "\n",
      "# Compute paths\n",
      "print(\"Computing regularization path using the coordinate descent lasso...\")\n",
      "model = LassoCV(cv=20).fit(X, y)\n",
      "\n",
      "# Display results\n",
      "m_log_alphas = -np.log10(model.alphas_)\n",
      "\n",
      "plt.figure()\n",
      "ymin, ymax = 2300, 3800\n",
      "plt.plot(m_log_alphas, model.mse_path_, ':')\n",
      "plt.plot(m_log_alphas, model.mse_path_.mean(axis=-1), 'k',\n",
      "         label='Average across the folds', linewidth=2)\n",
      "plt.axvline(-np.log10(model.alpha_), linestyle='--', color='k',\n",
      "            label='alpha: CV estimate')\n",
      "\n",
      "plt.legend()\n",
      "\n",
      "plt.xlabel('-log(alpha)')\n",
      "plt.ylabel('Mean square error')\n",
      "plt.title('Mean square error on each fold: coordinate descent ')\n",
      "plt.axis('tight')\n",
      "plt.ylim(ymin, ymax)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "###Now on the boston home price data..."
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# is L1\n",
      "n_trials, n_alpha = 30, 80\n",
      "err_train, err_test = np.zeros(n_alpha), np.zeros(n_alpha)\n",
      "for k in range(n_trials):\n",
      "    X_train, X_test, Y_train, Y_test = cross_validation.train_test_split(boston.data, boston.target, test_size=0.3)\n",
      "    x_complexity, y_train, y_test = [], [], []\n",
      "    for i in range(n_alpha):\n",
      "        a = 5*(i+1)/float(n_alpha)\n",
      "        clf = Lasso(alpha=a)\n",
      "        clf.fit(X_train,Y_train)\n",
      "        x_complexity.append(-np.log(a))\n",
      "        y_tmp = clf.score(X_train,Y_train)\n",
      "        y_train.append(y_tmp)\n",
      "        err_train[i] += y_tmp\n",
      "        y_tmp = clf.score(X_test,Y_test)\n",
      "        y_test.append(y_tmp)\n",
      "        err_test[i] += y_tmp\n",
      "    plt.plot(x_complexity, y_test, color=\"blue\", alpha=0.2)\n",
      "    plt.plot(x_complexity, y_train, color=\"red\", alpha=0.2)\n",
      "plt.plot(x_complexity, err_test/n_trials, color=\"blue\")\n",
      "plt.plot(x_complexity, err_train/n_trials, color=\"red\")\n",
      "plt.xlabel(\"-log(alpha)\")\n",
      "plt.ylabel(\"Fraction Correct\")\n",
      "plt.show()"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# simple single value gut check\n",
      "lr = Lasso(alpha=5*1/float(40))\n",
      "boston = datasets.load_boston()\n",
      "y = boston.target\n",
      "# cross_val_predict returns an array of the same size as `y` where each entry\n",
      "# is a prediction obtained by cross validation\n",
      "predicted = cross_val_predict(lr, boston.data, y, cv=10)\n",
      "fig,ax = plt.subplots()\n",
      "ax.scatter(y, predicted)\n",
      "ax.plot([y.min(), y.max()], [y.min(), y.max()], 'k--', lw=4)\n",
      "ax.set_xlabel('Measured')\n",
      "ax.set_ylabel('Predicted')\n",
      "plt.show()"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "Image(filename=\"Screen Shot 1.png\", width=600)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# is L2\n",
      "n_trials, n_alpha = 20, 80\n",
      "err_train, err_test = np.zeros(n_alpha), np.zeros(n_alpha)\n",
      "for k in range(n_trials):\n",
      "    X_train, X_test, Y_train, Y_test = cross_validation.train_test_split(boston.data, y, test_size=0.3)\n",
      "    x_complexity, y_train, y_test = [], [], []\n",
      "    for i in range(n_alpha):\n",
      "        a = 5*(i+1)/float(n_alpha)\n",
      "        clf = Ridge(alpha=a)\n",
      "        clf.fit(X_train,Y_train)\n",
      "        x_complexity.append(-np.log(a))\n",
      "        y_tmp = clf.score(X_train,Y_train)\n",
      "        y_train.append(y_tmp)\n",
      "        err_train[i] += y_tmp\n",
      "        y_tmp = clf.score(X_test,Y_test)\n",
      "        y_test.append(y_tmp)\n",
      "        err_test[i] += y_tmp\n",
      "    plt.plot(x_complexity, y_test, color=\"blue\", alpha=0.2)\n",
      "    plt.plot(x_complexity, y_train, color=\"red\", alpha=0.2)\n",
      "plt.plot(x_complexity, err_test/n_trials, color=\"blue\")\n",
      "plt.plot(x_complexity, err_train/n_trials, color=\"red\")\n",
      "plt.xlabel(\"-log(alpha)\")\n",
      "plt.ylabel(\"Fraction Correct\")\n",
      "plt.show()"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# simple single value gut check\n",
      "lr = Ridge(alpha=5*1/float(4))\n",
      "boston = datasets.load_boston()\n",
      "y = boston.target\n",
      "# cross_val_predict returns an array of the same size as `y` where each entry\n",
      "# is a prediction obtained by cross validation\n",
      "predicted = cross_val_predict(lr, boston.data, y, cv=10)\n",
      "fig,ax = plt.subplots()\n",
      "ax.scatter(y, predicted)\n",
      "ax.plot([y.min(), y.max()], [y.min(), y.max()], 'k--', lw=4)\n",
      "ax.set_xlabel('Measured')\n",
      "ax.set_ylabel('Predicted')\n",
      "plt.show()"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "Image(filename=\"Screen Shot 3.png\", width=600)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "Image(filename=\"Screen Shot 4.png\", width=600)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "Next Time:\n",
      "* Better definition of DOF\n",
      "* AIC and BIC\n",
      "* ROC curves\n",
      "\n",
      "...and more!"
     ]
    }
   ],
   "metadata": {}
  }
 ]
}